0.0. Imports¶

0.1. Libraries¶

In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import plotly.express as px
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import HTML
import plotly.graph_objects as go
from geopy.geocoders import Nominatim
from matplotlib import gridspec

0.2. Helper Functions¶

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline

    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )


    sns.set()
jupyter_settings()
%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib

0.3. Loading Data¶

In [3]:
data = pd.read_csv('../data/kc_house_data.csv')

1.0. Data Dascription¶

In [4]:
data.head()
Out[4]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.00 3 1.00 1180 5650 1.00 0 0 ... 7 1180 0 1955 0 98178 47.51 -122.26 1340 5650
1 6414100192 20141209T000000 538000.00 3 2.25 2570 7242 2.00 0 0 ... 7 2170 400 1951 1991 98125 47.72 -122.32 1690 7639
2 5631500400 20150225T000000 180000.00 2 1.00 770 10000 1.00 0 0 ... 6 770 0 1933 0 98028 47.74 -122.23 2720 8062
3 2487200875 20141209T000000 604000.00 4 3.00 1960 5000 1.00 0 0 ... 7 1050 910 1965 0 98136 47.52 -122.39 1360 5000
4 1954400510 20150218T000000 510000.00 3 2.00 1680 8080 1.00 0 0 ... 8 1680 0 1987 0 98074 47.62 -122.05 1800 7503

5 rows × 21 columns

1.1. Data Dimensions¶

In [5]:
data.shape
Out[5]:
(21613, 21)

1.2. Data Types¶

In [6]:
data.dtypes
Out[6]:
id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

1.3. Change Dtypes¶

In [7]:
data['date'] = pd.to_datetime(data['date'])
data['floors'] = data['floors'].astype(int)

1.4. Check and Fiilout NA¶

In [8]:
data.isna().sum()
Out[8]:
id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

1.5. Descriptive Statistics¶

In [9]:
cat_attributes = data[['view', 'condition', 'waterfront',  'grade']]
num_attributes = data.drop(['view', 'condition', 'waterfront',  'grade'], axis = 1 )
In [10]:
num_attributes.describe().T
Out[10]:
count mean std min 25% 50% 75% max
id 21613.00 4580301520.86 2876565571.31 1000102.00 2123049194.00 3904930410.00 7308900445.00 9900000190.00
price 21613.00 540088.14 367127.20 75000.00 321950.00 450000.00 645000.00 7700000.00
bedrooms 21613.00 3.37 0.93 0.00 3.00 3.00 4.00 33.00
bathrooms 21613.00 2.11 0.77 0.00 1.75 2.25 2.50 8.00
sqft_living 21613.00 2079.90 918.44 290.00 1427.00 1910.00 2550.00 13540.00
sqft_lot 21613.00 15106.97 41420.51 520.00 5040.00 7618.00 10688.00 1651359.00
floors 21613.00 1.45 0.55 1.00 1.00 1.00 2.00 3.00
sqft_above 21613.00 1788.39 828.09 290.00 1190.00 1560.00 2210.00 9410.00
sqft_basement 21613.00 291.51 442.58 0.00 0.00 0.00 560.00 4820.00
yr_built 21613.00 1971.01 29.37 1900.00 1951.00 1975.00 1997.00 2015.00
yr_renovated 21613.00 84.40 401.68 0.00 0.00 0.00 0.00 2015.00
zipcode 21613.00 98077.94 53.51 98001.00 98033.00 98065.00 98118.00 98199.00
lat 21613.00 47.56 0.14 47.16 47.47 47.57 47.68 47.78
long 21613.00 -122.21 0.14 -122.52 -122.33 -122.23 -122.12 -121.31
sqft_living15 21613.00 1986.55 685.39 399.00 1490.00 1840.00 2360.00 6210.00
sqft_lot15 21613.00 12768.46 27304.18 651.00 5100.00 7620.00 10083.00 871200.00

2.0. Feature Engineering¶

In [11]:
data2 = data.copy()

2.1. Standard and High Standard feature¶

In [12]:
data2['standard'] = data2['price'].apply(lambda x: 'high_standard' if x >= 540000 else 'standard')

2.2. House Age¶

In [13]:
data2['house_age'] = data2['date'].apply(lambda x: "new_house" if x >= pd.to_datetime('2014/01/01', format = "%Y/%m/%d") else "old_house")

2.3. Dormitory Type¶

In [14]:
data2['dormitory_type'] = data2['bedrooms'].apply(lambda x: "studio" if x == 1 else "apartment" if x == 2 else "house" if x > 2 else "NA")

2.4. Condition Type¶

In [15]:
data2['condition_type'] = data2['condition'].apply(lambda x: "bad" if x <= 2 else "regular" if ( x == 3 ) | ( x == 4) else "good" )

2.5. Year¶

In [16]:
data2['year'] = data2['date'].dt.year

2.6. Week¶

In [17]:
data2['week'] = data2['year'].astype(str) + " " + (data2['date'].dt.isocalendar().week).astype(str) 

2.7. Level¶

In [18]:
data2['level'] = data2['price'].apply(lambda x: 0 if x < 321950 else
                                                1 if (x >= 321950) & (x < 450000) else
                                                2 if (x >= 450000) & (x < 645000) else
                                                3
                                     )

3.0. Data Filtering¶

In [19]:
data3 = data2.copy()

3.1. Bedrooms¶

In [20]:
data3 = data3[data3['bedrooms'] < 33]

4.0. CEO Requests¶

In [21]:
data4 = data3.copy()

4.1. Report sorted by price with the following information:¶

  • id
  • date
  • no or bedrooms
  • lot size
  • price
  • Property classification
In [22]:
report = data4[['id', 'date','bedrooms','sqft_lot','price','standard']].sort_values('price', ascending = False).reset_index()
report.head()
Out[22]:
index id date bedrooms sqft_lot price standard
0 7252 6762700020 2014-10-13 6 27600 7700000.00 high_standard
1 3914 9808700762 2014-06-11 5 37325 7062500.00 high_standard
2 9254 9208900037 2014-09-19 6 31374 6885000.00 high_standard
3 4411 2470100110 2014-08-04 5 35069 5570000.00 high_standard
4 1448 8907500070 2015-04-13 5 23985 5350000.00 high_standard
In [23]:
report.to_csv('../data/report_1.csv', index = False)

4.2. Whats the price change over time?¶

In [24]:
plot_ano = data4.groupby('date')['price'].mean().reset_index()
plt.rcParams["figure.figsize"] = (25,8)
plt.title("Price change over time")
sns.lineplot(x = data4['date'], y = data4['price'])
Out[24]:
<AxesSubplot: title={'center': 'Price change over time'}, xlabel='date', ylabel='price'>

4.3. Whats the price change over the weeks?¶

In [25]:
data1_week = data4.sort_values('week')
plt.title("Price change over weeks")
plt.xticks(fontsize = 7, rotation = 90)
sns.barplot(x = data1_week['week'], y = data1_week['price'])
Out[25]:
<AxesSubplot: title={'center': 'Price change over weeks'}, xlabel='week', ylabel='price'>

4.4. Num of bedrooms¶

In [26]:
plt.figure(figsize = (8,5) )
sns.histplot(data = data4, x = 'bedrooms')
Out[26]:
<AxesSubplot: xlabel='bedrooms', ylabel='Count'>

4.5. AVG price by Year built¶

In [27]:
preco_ano = data4[['yr_built','price']].groupby('yr_built').mean().reset_index()
fig = go.Figure()
fig.add_trace(go.Scatter(x=preco_ano['yr_built'], y=preco_ano['price'],
                    line_shape='spline'))
fig.update_layout(title_text='AVG Price by YR Built',title_x=0.5,title_y=0.9, width = 800, height = 600)

4.6. AVG price by Dormitory Type¶

In [28]:
media_dorm = data4[['dormitory_type','price']].groupby('dormitory_type').mean().reset_index()
fig = px.bar(media_dorm, y = 'price', x = 'dormitory_type', text_auto = '.2s', width = 800, height = 500)
fig.update_layout(title_text='Preço por tipo de dormitório',title_x=0.5,title_y=0.95,width = 800, height = 600)
fig.update_traces(textfont_size=12)

fig.show()

4.7. Price by Year Renovated¶

In [29]:
media_renovacao = data4[['yr_renovated','price']].groupby('yr_renovated').mean().reset_index()
media_renovacao = media_renovacao[media_renovacao['yr_renovated']>=1930]
fig = go.Figure()
fig.add_trace(go.Scatter(x=media_renovacao['yr_renovated'], y=media_renovacao['price'],
                    line_shape='spline'))
fig.update_layout(title_text='Price by YR Renovated',title_x=0.5,title_y=0.9, width = 800, height = 600)

4.7. AVG price by Year Built and Dormitory type¶

In [30]:
media_ano_dorms = data4[['yr_built','dormitory_type','price']]
media_ano_dorms = pd.pivot_table(media_ano_dorms, values='price', index=['yr_built'], columns=['dormitory_type'], aggfunc=np.sum).fillna(0).reset_index().groupby('yr_built').sum()
media_ano_dorms[['NA','apartment','house', 'studio']] = media_ano_dorms[['NA','apartment','house', 'studio']].mask(media_ano_dorms[['NA','apartment','house', 'studio']] > 300000000, 30000000)

fig = px.imshow(media_ano_dorms.values,
                labels=dict(x="dormitory_type", y="yr_built", color="Price"),
                x = media_ano_dorms.columns,
                y = media_ano_dorms.index,
                color_continuous_scale='brwnyl'
               )

fig.update_xaxes(side="top")

fig.update_layout(width = 800, height = 800)
fig.show()

4.8. Houses by level¶

In [33]:
plt.figure(figsize = (8,5) )
sns.histplot(data = data4, x = 'level', )
Out[33]:
<AxesSubplot: xlabel='level', ylabel='Count'>
In [38]:
data = pd.read_csv('../data/kc_house_data.csv')
houses = data[['id','lat','long','price']].copy()

# Define level
houses['level'] = houses['price'].apply(lambda x: 0 if x < 321950 else
                                              1 if (x >= 321950) & (x < 450000) else
                                              2 if (x >= 450000) & (x < 645000) else
                                              3
                                    )

houses['level'] = houses['level'].astype( int )

fig = px.scatter_mapbox( houses,
                         lat = 'lat',
                         lon = 'long',
                         color = 'level',
                         size = 'price',
                         color_continuous_scale = px.colors.cyclical.IceFire,
                         size_max = 15,
                         zoom = 10 )

fig.update_layout( mapbox_style = 'open-street-map' )
fig.update_layout( height = 800, margin = {'r':0, 't':0, 'l':0, 'b':0})

#fig.show()
fig.write_html("../results/kc_house_map.html")
fig.show()